# Chunk options: high-resolution figures written under fig/
knitr::opts_chunk$set(dpi = 300, fig.path = "fig/", fig.width = 12, fig.height = 8)

library(tidymodels)
library(modeltime.h2o)
library(tidyverse)
library(timetk)
library(readxl)
library(readr)

# Daily case-count series; parse Date, and keep an untouched copy (cv2)
# for the model that uses no mobility covariates.
cv <- read_excel("data/cv.xlsx")
cv$Date <- as.Date(cv$Date)
cv2 <- cv

# First Facebook Movement Range extract (tab-delimited)
d1 <- read_delim("data/fb1", "\t", escape_double = FALSE, trim_ws = TRUE)
-- Column specification -------------------------------------------------------------------
cols(
ds = col_date(format = ""),
country = col_character(),
polygon_source = col_character(),
polygon_id = col_character(),
polygon_name = col_character(),
all_day_bing_tiles_visited_relative_change = col_double(),
all_day_ratio_single_tile_users = col_double(),
baseline_name = col_character(),
baseline_type = col_character()
)
# Second Facebook Movement Range extract (tab-delimited)
d2 <- read_delim(file = "data/fb2", delim = "\t", escape_double = FALSE, trim_ws = TRUE)
-- Column specification -------------------------------------------------------------------
cols(
ds = col_date(format = ""),
country = col_character(),
polygon_source = col_character(),
polygon_id = col_character(),
polygon_name = col_character(),
all_day_bing_tiles_visited_relative_change = col_double(),
all_day_ratio_single_tile_users = col_double(),
baseline_name = col_character(),
baseline_type = col_character()
)
# Restrict both mobility extracts to Sri Lanka (ISO3 "LKA"), average the two
# daily mobility metrics per date across polygons, and attach them — plus
# lagged copies (lags 1, 5, 9, 13, 17) — to the case-count series.
lk <- d1 %>% filter(country == "LKA")
lk <- lk %>%
  bind_rows(d2 %>% filter(country == "LKA")) %>%
  group_by(ds) %>%
  summarise(
    adb = mean(all_day_bing_tiles_visited_relative_change),
    ar  = mean(all_day_ratio_single_tile_users)
  ) %>%
  rename(Date = ds)
# Explicit join key: avoids the silent 'Joining, by = "Date"' inference.
cv <- cv %>%
  left_join(lk, by = "Date") %>%
  tk_augment_lags(c("adb", "ar"), .lags = seq(1, 20, by = 4))
Joining, by = "Date"
Data splitting - The most recent 45 days as the assessment split
# Quick visual check of the raw target series, then hold out the last 45 days
# (cumulative = TRUE keeps all earlier history in the training split).
cv %>% plot_time_series(.date_var = Date, .value = n)
splits <- cv %>% time_series_split(assess = "45 day", cumulative = TRUE)
Using date_var: Date
# Same 45-day holdout for the covariate-free copy of the series
splits2 <- cv2 %>% time_series_split(assess = "45 day", cumulative = TRUE)
Using date_var: Date
# Calendar-feature recipes, one per dataset: cv carries the mobility columns
# and their lags, cv2 does not, so each split needs its own recipe.
recipe_spec <- recipe(n ~ ., data = training(splits)) %>%
  step_timeseries_signature(Date)
recipe_spec2 <- recipe(n ~ ., data = training(splits2)) %>%
  step_timeseries_signature(Date)

# Prep each recipe once instead of re-prepping for every bake.
prepped  <- prep(recipe_spec)
prepped2 <- prep(recipe_spec2)

train_tbl <- bake(prepped, training(splits))
test_tbl  <- bake(prepped, testing(splits))
# BUG FIX: the splits2 data must be baked with its own recipe (recipe_spec2);
# the original baked it with recipe_spec, which was prepped on data containing
# the mobility columns absent from cv2, and left recipe_spec2 unused.
train_tbl2 <- bake(prepped2, training(splits2))
test_tbl2  <- bake(prepped2, testing(splits2))
# Start (or connect to) a local H2O cluster; nthreads = -1 uses all cores.
h2o.init(nthreads = -1, ip = "localhost", port = 54321)
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 10 hours 50 minutes
H2O cluster timezone: Asia/Colombo
H2O data parsing timezone: UTC
H2O cluster version: 3.32.1.5
H2O cluster version age: 10 days
H2O cluster name: H2O_started_from_R_Supun_aww077
H2O cluster total nodes: 1
H2O cluster total memory: 0.77 GB
H2O cluster total cores: 4
H2O cluster allowed cores: 4
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
R Version: R version 3.6.3 (2020-02-29)
# H2O AutoML regression spec: 30 s total budget, 10 s per model, at most
# 3 models, 5-fold CV, no algorithms excluded, fixed seed for reproducibility.
model_spec <- automl_reg(mode = "regression") %>%
  set_engine(
    engine                     = "h2o",
    max_runtime_secs           = 30,
    max_runtime_secs_per_model = 10,
    max_models                 = 3,
    nfolds                     = 5,
    exclude_algos              = c(),
    verbosity                  = 1,
    seed                       = 786
  )
print(model_spec)
H2O AutoML Model Specification (regression)
Engine-Specific Arguments:
max_runtime_secs = 30
max_runtime_secs_per_model = 10
max_models = 3
nfolds = 5
exclude_algos = c()
verbosity = 1
seed = 786
Computational engine: h2o
# Run AutoML on the mobility-augmented training data (target: n)
model_fitted <- fit(model_spec, n ~ ., data = train_tbl)
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
Training H2O AutoML...
|
| | 0%
|
|==== | 5%
|
|======= | 8%
|
|========== | 12%
|
|============ | 15%
|
|=============== | 19%
|
|================== | 22%
|
|=================================================================================| 100%
|
| | 0%
|
|==========================================================================================| 100%
Leaderboard:
[5 rows x 6 columns]
Using top model: StackedEnsemble_BestOfFamily_AutoML_20210815_183318
# NOTE(review): interactive-only call — opens the data viewer in RStudio and
# is not meaningful in a non-interactive / knitted run; consider removing.
View(cv)
# Run AutoML on the covariate-free training data (target: n)
model_fitted2 <- fit(model_spec, n ~ ., data = train_tbl2)
Converting to H2OFrame...
|
| | 0%
|
|==========================================================================================| 100%
Training H2O AutoML...
|
| | 0%
|
|==== | 5%
|
|======== | 8%
|
|=========== | 12%
|
|============== | 15%
|
|================= | 19%
|
|==================== | 22%
|
|==========================================================================================| 100%
|
| | 0%
|
|==========================================================================================| 100%
Leaderboard:
[5 rows x 6 columns]
Using top model: StackedEnsemble_BestOfFamily_AutoML_20210815_183342
# Show the fitted AutoML leader for the covariate-free model
print(model_fitted2)
parsnip model object
Fit time: 24.6s
H2O AutoML - Stackedensemble
--------
Model: Model Details:
==============
H2ORegressionModel: stackedensemble
Model ID: StackedEnsemble_BestOfFamily_AutoML_20210815_183342
Number of Base Models: 3
Base Models (count by algorithm type):
drf gbm glm
1 1 1
Metalearner:
Metalearner algorithm: glm
Metalearner cross-validation fold assignment:
Fold assignment scheme: AUTO
Number of folds: 5
Fold column: NULL
Metalearner hyperparameters:
H2ORegressionMetrics: stackedensemble
** Reported on training data. **
MSE: 633.7107
RMSE: 25.17361
MAE: 15.46799
RMSLE: 0.4973546
Mean Residual Deviance : 633.7107
H2ORegressionMetrics: stackedensemble
** Reported on cross-validation data. **
** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
MSE: 9218.817
RMSE: 96.01467
MAE: 49.08651
RMSLE: 0.7351128
Mean Residual Deviance : 9218.817
# Show the fitted AutoML leader for the mobility-augmented model
print(model_fitted)
parsnip model object
Fit time: 21.7s
H2O AutoML - Stackedensemble
--------
Model: Model Details:
==============
H2ORegressionModel: stackedensemble
Model ID: StackedEnsemble_BestOfFamily_AutoML_20210815_183318
Number of Base Models: 3
Base Models (count by algorithm type):
drf gbm glm
1 1 1
Metalearner:
Metalearner algorithm: glm
Metalearner cross-validation fold assignment:
Fold assignment scheme: AUTO
Number of folds: 5
Fold column: NULL
Metalearner hyperparameters:
H2ORegressionMetrics: stackedensemble
** Reported on training data. **
MSE: 967.0529
RMSE: 31.09747
MAE: 19.61695
RMSLE: 0.4347607
Mean Residual Deviance : 967.0529
H2ORegressionMetrics: stackedensemble
** Reported on cross-validation data. **
** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
MSE: 10962.94
RMSE: 104.7041
MAE: 58.95527
RMSLE: NaN
Mean Residual Deviance : 10962.94
# Calibrate the covariate-free model on its test split and plot the
# out-of-sample forecast against the full actual series.
modeltime_tbl <- modeltime_table(model_fitted2)

calibrated2 <- modeltime_calibrate(modeltime_tbl, test_tbl2)
forecast2 <- modeltime_forecast(
  calibrated2,
  new_data    = test_tbl2,
  actual_data = cv2,
  keep_data   = TRUE
)
plot_modeltime_forecast(forecast2, .interactive = TRUE)
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
# Recombine train + test for refitting, then build a 9-month future scaffold.
data_prepared_tbl <- bind_rows(train_tbl2, test_tbl2)
# Name the date column explicitly instead of relying on future_frame()'s
# ".date_var is missing" inference (see the console message it emitted).
future_tbl <- data_prepared_tbl %>%
  future_frame(.date_var = Date, .length_out = "9 month")
.date_var is missing. Using: Date
# BUG FIX: this is the cv2 pipeline (train_tbl2 / test_tbl2 / model_fitted2),
# so the future frame must be baked with recipe_spec2; the original used
# recipe_spec, which was prepped on data that also carries the mobility
# columns and their lags.
future_prepared_tbl <- bake(prep(recipe_spec2), future_tbl)

# Refit the AutoML model on the full (train + test) history
refit_tbl <- modeltime_tbl %>%
  modeltime_refit(data_prepared_tbl)
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
Training H2O AutoML...
|
| | 0%
|
|==== | 5%
|
|======= | 9%
|
|========== | 12%
|
|============= | 16%
|
|================ | 19%
|
|=================== | 23%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Leaderboard:
[5 rows x 6 columns]
Using top model: StackedEnsemble_BestOfFamily_AutoML_20210815_183414
# Forecast 9 months ahead with the refit model and plot against history.
# refit_tbl was never calibrated, so no .conf_lo/.conf_hi columns exist;
# requesting intervals only triggers a warning and is force-disabled anyway —
# disable them explicitly (or run modeltime_calibrate() first to get them).
refit_tbl %>%
  modeltime_forecast(
    new_data = future_prepared_tbl,
    actual_data = data_prepared_tbl,
    keep_data = TRUE
  ) %>%
  plot_modeltime_forecast(
    .interactive = TRUE, .conf_interval_show = FALSE
  )
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Expecting the following names to be in the data frame: .conf_hi, .conf_lo.
Proceeding with '.conf_interval_show = FALSE' to visualize the forecast without confidence intervals.
Alternatively, try using `modeltime_calibrate()` before forecasting to add confidence intervals.
# Calibrate the mobility-augmented model on its test split and plot the
# out-of-sample forecast against the full actual series.
modeltime_tbl <- modeltime_table(model_fitted)

calibrated1 <- modeltime_calibrate(modeltime_tbl, test_tbl)
forecast1 <- modeltime_forecast(
  calibrated1,
  new_data    = test_tbl,
  actual_data = cv,
  keep_data   = TRUE
)
plot_modeltime_forecast(forecast1, .interactive = TRUE)
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
# Recombine train + test for the mobility-augmented model and refit on it
data_prepared_tbl <- bind_rows(train_tbl, test_tbl)
refit_tbl <- modeltime_refit(modeltime_tbl, data_prepared_tbl)
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
Training H2O AutoML...
|
| | 0%
|
|= | 1%
|
|==== | 5%
|
|======= | 8%
|
|========== | 12%
|
|============= | 16%
|
|=============== | 19%
|
|================== | 22%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Leaderboard:
[5 rows x 6 columns]
Using top model: StackedEnsemble_BestOfFamily_AutoML_20210815_183444
# 9-month future scaffold; name the date column explicitly instead of relying
# on future_frame()'s ".date_var is missing" inference.
future_tbl <- data_prepared_tbl %>%
  future_frame(.date_var = Date, .length_out = "9 month")
.date_var is missing. Using: Date
# Scenario: mobility held constant over the horizon (adb = -0.4, ar = 0.3),
# then the same lag features as training are regenerated.
future_prepared_tbl <- future_tbl %>%
  bake(prep(recipe_spec), .) %>%
  mutate(adb = -0.4, ar = 0.3) %>%
  tk_augment_lags(c("adb", "ar"), .lags = seq(1, 20, by = 4))
# Forecast the reduced-mobility scenario and plot against history.
# refit_tbl was never calibrated, so no .conf_lo/.conf_hi columns exist;
# requesting intervals only triggers a warning and is force-disabled anyway —
# disable them explicitly (or run modeltime_calibrate() first to get them).
refit_tbl %>%
  modeltime_forecast(
    new_data = future_prepared_tbl,
    actual_data = data_prepared_tbl,
    keep_data = TRUE
  ) %>%
  plot_modeltime_forecast(
    .interactive = TRUE, .conf_interval_show = FALSE
  )
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Expecting the following names to be in the data frame: .conf_hi, .conf_lo.
Proceeding with '.conf_interval_show = FALSE' to visualize the forecast without confidence intervals.
Alternatively, try using `modeltime_calibrate()` before forecasting to add confidence intervals.
# Scenario: mobility back at baseline (adb = 0, ar = 0) over the horizon,
# with the same lag features regenerated.
future_prepared_tbl <- future_tbl %>%
  bake(prep(recipe_spec), .) %>%
  mutate(adb = 0, ar = 0) %>%
  tk_augment_lags(c("adb", "ar"), .lags = seq(1, 20, by = 4))
# Forecast the baseline-mobility scenario and plot against history.
# refit_tbl was never calibrated, so no .conf_lo/.conf_hi columns exist;
# requesting intervals only triggers a warning and is force-disabled anyway —
# disable them explicitly (or run modeltime_calibrate() first to get them).
refit_tbl %>%
  modeltime_forecast(
    new_data = future_prepared_tbl,
    actual_data = data_prepared_tbl,
    keep_data = TRUE
  ) %>%
  plot_modeltime_forecast(
    .interactive = TRUE, .conf_interval_show = FALSE
  )
Converting to H2OFrame...
|
| | 0%
|
|=================================================================================| 100%
|
| | 0%
|
|=================================================================================| 100%
Expecting the following names to be in the data frame: .conf_hi, .conf_lo.
Proceeding with '.conf_interval_show = FALSE' to visualize the forecast without confidence intervals.
Alternatively, try using `modeltime_calibrate()` before forecasting to add confidence intervals.